*****************************************************************
* Replication directory for                                   ***
* Prime locations                                             ***
* by Gabriel M. Ahlfeldt, Thilo N.H. Albers, Kristian Behrens ***
* Published in American Economic Review: Insights             ***
*****************************************************************
* 01/2025
* Stata
version 17.0

* This master do-file
* - defines the root directory for the analysis in Stata / Python, on which all
*   path macros used by the other Stata scripts are built,
* - defines relative paths and generates subdirectories for outputs,
* - calls all other Stata files that generate intermediate and final outputs.

* Define users and root directories
* Please replace the directories below with your own paths; three users are
* predefined so that you can switch between machines:
	* User 1 Windows server
	* User 2 Linux server
	* User 3 Mac Desktop
	global user = 3

* Stop early on an unsupported user id: none of the branches below would set
* $root, and the subsequent -cd- and all derived paths would be wrong.
	if !inlist("$user", "1", "2", "3") {
		display as error "global user must be 1, 2 or 3 (found: $user)"
		exit 198
	}

* Set global root here: all other folders will be created within the unzipped directory
	* Windows (also registers the Python executable and user path permanently)
	if $user == 1 {
		global root  	"H:\Replication_PrimeLocations"
		python set exec   	"C:\ProgramData\Anaconda3\python.exe" , permanently
		set python_userpath "C:\Users\alberstn\AppData\Roaming\Python\Python311", permanently
	}

	* Linux (no Python executable is configured for this user)
	if $user == 2 {
		global root "/usr/net/primelocations/Detecting Prime Locations"
	}

	* Mac
	if $user == 3 {
		global root "/Users/talbe_01/Dropbox/Data and Replication/PL 2026_FEB_20/dataverse_files"
		python set exec "/opt/anaconda3/bin/python"
		set python_userpath ""
	}

	* Change the working directory to the root folder
	cd "$root"
	
	* Some of the scripts require a Python environment to be set up on your machine.
	*
	* We recommend the Anaconda distribution, available at https://www.anaconda.com/products/distribution
	* We have used the following version: 3.11.7 (64-bit architecture)
	*
	* From the Anaconda prompt, install the relevant Python packages using the command:
	*     "pip install shapely pandas geopandas numpy tqdm scipy"
	* We have used the following package versions:
	*     Shapely    2.0.6
	*     Pandas     2.1.4
	*     GeoPandas  1.0.1
	*     NumPy      1.26.4
	*     tqdm       4.65.0
	*     SciPy      1.11.4

	* Switch: should the Python-based scripts be run (i.e. do you have a Python environment installed)?
	*     1 = yes
	*     0 = no; the TEMP archives are unzipped further below instead
	global runpython_scripts 0

	* Install the Python dependencies from within Stata. This works on macOS and Linux;
	* on Windows server architectures you may have to install them manually via the shell.
	if $runpython_scripts != 0 {
		shell pip3 install  shapely pandas geopandas numpy tqdm scipy
	}

	
* Define macros for the origin folders of inputs and the destination folders of outputs
	* Inputs
	global scripts "$root/_Scripts"
	global data "$root/_Data"
	global data_3USCITIES "$root/_Data/3 US CITIES"
	global data_USMETROS "$root/_Data/US METRO DATA"
	global data_125cities "$root/_Data/125 GLOBAL CITIES"
	* Intermediate files
	global temp "$root/_Work/TEMP"
	* Outputs
	global tables "$root/_Work/Output/Tables"
	global tables_App "$root/_Work/Output/Tables_Appendix"
	global figures "$root/_Work/Output/Figures"
	global figures_App "$root/_Work/Output/Figures_Appendix"
	global dataoutput "$root/_Work/Output/Data"
	global shapeoutput "$root/_Work/Output/Shapes"
	global log "$root/_Work/log"

* Create necessary folders
	* -mkdir- does not create missing parent directories, so the tree is built
	* top-down, starting with _Work itself. -capture- keeps the run going when
	* a folder already exists.
	capture mkdir "$root/_Work"
	capture mkdir "$root/_Work/Output"
	capture mkdir "$temp"
	capture mkdir "$tables"
	capture mkdir "$tables_App"
	capture mkdir "$figures"
	capture mkdir "$figures_App"
	capture mkdir "$dataoutput"
	capture mkdir "$shapeoutput"
	capture mkdir "$log"
	
	
* Unpacking zipped files
* -unzipfile- extracts into the current working directory, hence the -cd- calls.
* The replace option makes this step re-runnable: without it, -unzipfile- stops
* as soon as a previously extracted file is already present.

	* Unzip data files for the US metro dataset
		local unzip_USdata `" "BlockData"  "GIS Data"  "HeightGap" "HistoricMfg" "Housing" "LandUseRegulation" "PL_emp20km" "Raw Numeric Data"  "CLUSTER OUTPUT" "'

		cd "$data_USMETROS"
		foreach x of local unzip_USdata {
			display "`x'"
			unzipfile "`x'.zip", replace
		}

	* Unzip data files for the global (125 cities) dataset
		local unzip_globaldata `" "COWORKING" "EMPORIS" "GEO DATA" "GIS" "METRO_LEVEL_COVARIATES" "OVERID" "PHOTOS" "PL GLOBAL CITY DATASET" "POP" "STARBUCKS_SNL" "TWITTER" "'

		cd "$data_125cities"
		foreach x of local unzip_globaldata {
			display "`x'"
			unzipfile "`x'.zip", replace
		}

	* Unzip the TEMP archives if Python is not run
		cd "$temp"

		if $runpython_scripts == 0 {
			forvalues x = 1/7 {
				unzipfile "TEMP_ARCHIVE`x'.zip", replace
			}
		}

	* Return to the root folder
		cd "$root"
* Install user-written packages
* Every install uses the replace option so that the step also succeeds on machines
* where an older copy of a package is already installed (otherwise -ssc install- /
* -net install- stop with "files already exist and are different").

	* Packages hosted outside SSC
	net install scheme-modern, replace from("https://raw.githubusercontent.com/mdroste/stata-scheme-modern/master/")
	net install grc1leg, replace from("http://www.stata.com/users/vwiggins")
	net install estout, replace from("https://raw.githubusercontent.com/benjann/estout/master/")

	* Packages from SSC
	foreach pkg in distinct require texsave spmap shp2dta filelist geonear ineqdeco ftools reghdfe rangejoin rangestat nnls {
		ssc install `pkg', replace
	}

	* Use the graph scheme installed above
	set scheme modern
				

* Prelims: global constants, custom programs and the random-number seed
	* Define scale factor that converts ruler distances into geodesic distances.
	* The macro is defined without "=", so it stores the expression text itself;
	* the expression is only evaluated where $bmd is expanded.
	* NOTE(review): grid_y is presumably a variable in the data sets in memory at that
	* point, and 6371000 presumably the Earth radius in meters - confirm against the callers.
		global bmd  cos(2*atan(exp([grid_y^2^0.5]/6371000))-_pi/2)
	
	* Select global parameter values for PL identification
		global PLtresh = 0.05 				// Cluster threshold, min employment relative to largest PL in city
		global DISTtresh = 2500 			// Aggregation distance, min distance between PLs in city (in meters)
		global SplitDist = 1000 			// Split distance, distance beyond which final PLs are split (in meters)
		global DisconnectDISTthresh = 5000 	// If vertical or horizontal size is greater than that, consider disconnecting prime location and splitting it in two (in meters)
			
	* Load custom programs from the scripts folder
		do "$scripts/1_DefinePrograms.do"		 

* Set the random-number seed so that repeated runs use the same random-number stream
	set seed 1251368

**********************************************
* All US METROS  (NETS microgeographic data) *
**********************************************
* Close any log left open by an earlier, interrupted run; otherwise -log using-
* aborts because a log file is already open.
capture log close
log using "$log/log.log", replace

			* Prepare metro-grid legend
			do "$scripts/3a_0_GenerateLegend"

			* Define list of p-values to be evaluated
			global plist = "98_5 99_0  99_9 99_5"

			* Delineate PLs, nested in a loop over p-values.
			* The current p-value is passed to the called scripts via the global SL.
			foreach level in $plist {
				global SL = "`level'"

				* Process outputs of C++ clustering algorithm and prepare metro-grid data
				* with employment and clusterID for delineation of PLs
				do "$scripts/3a_2_CompileEmploymentGrid_CBSAs_pval"	// by p-value

				* Generate PLs for US CBSAs
				do "$scripts/3a_3_GeneratePLs" // by p-value

				* Describe prime locations
				do "$scripts/3b_1_SummaryStats_PLs_US_A" // by p-value

				* Describe CBSAs
				do "$scripts/3b_1_SummaryStats_PLs_US_B" // by p-value
			}
			* Loop over p-values ends
		
			* --- Threshold validation ------------------------------------------------

			* Distances from PL boundaries, used in the BDD analysis
			do "$scripts/3b_3_CalculateDistancesPLs"

			* BDD plots by p-value, based on those distances
			do "$scripts/3c_3_ValidateThreshold"

			* Preferred p-value, chosen on the basis of the BDD analysis
			global SL = "99_5"

			* Select the results for the preferred p-value as main results and copy them to the output folders
			do "$scripts/3c_BaselineResults.do"

			* --- Mapping and toolkit shapes ------------------------------------------

			* Grid shapefiles with identifiers for mapping (calls Python Geopandas)
			if $runpython_scripts != 0 {
				do "$scripts/3d_GenShapes4Mapping"
			}

			* Maps of prime location boundaries against an employment density background
			do "$scripts/3d_MAPPING.do"

			* Grid shapes with employment by sector for the toolkit (calls Python Geopandas)
			if $runpython_scripts != 0 {
				do "$scripts/3d_GenGrids4toolkit.do"
			}

			* Prime location boundary shapefiles for the toolkit (calls Python Geopandas)
			if $runpython_scripts != 0 {
				do "$scripts/3e_GenPLshapes.do"
			}

			* --- Block-group accessibility -------------------------------------------

			* Distance from block groups to prime locations (this do-file calls Python Geopandas).
			* Note that you can instead run the Python script of the same name directly in Python
			* for a parallelized approach that saves time.
			if $runpython_scripts != 0 {
				do "$scripts/3f_GenBlockGroup2PLdist.do"
			}

			* Accessibility measures from prime location distances and block group data
			do "$scripts/3g_BlockGroupAnalysis.do"

			* PL accessibility analysis
			do "$scripts/3h_PLaccessibility.do"

			* PL summary table / spatial structure output.
			* This do-file requires the PLemp20km file, which is generated from C++ code
			* using Prime Locations as input.
			if $runpython_scripts != 0 {
				do "$scripts/3h_PLsummaryTable.do"
			}

			* --- Validation against CBP data -----------------------------------------

			* Shares of ZCTA area covered by prime locations (calls Python Geopandas)
			if $runpython_scripts != 0 {
				do "$scripts/3i_Validation_CBP_datagen.do"
			}

			* Validate delineated prime locations against CBP data
			do "$scripts/3j_Validation_CBP.do"
			
		
********************************************************************************
* Generation and validation of scraped employment (NETS and scraped POIs data) *
********************************************************************************

	* Step 1: populations for the overlap of the Global City dataset and the US MSA sample
	do "$scripts/4a_0_POP_MSAs.do"

	* Step 2: match the prime points to the new US METRO grids
	do "$scripts/4a_1_MATCH_PrimePointsToGrid.do"

	* Step 3: compile the data for the estimation of weights
	do "$scripts/4a_2_CompilingWeights.do"

	* Step 4: estimate employment weights based on
	*     - all cities in both the CBSA and Global Cities samples (for the Global Cities analysis)
	*     - the first half by alphabet (for validation)
	*     - the second half by alphabet (for validation)
	* The outputs of this step are inputs into the C++ clustering algorithm.
	do "$scripts/4a_3_Weights_CBSAs.do"

	* Step 5: process outputs of the C++ clustering algorithm and prepare metro-grid data
	* with employment and clusterID for the delineation of PLs (by employment type)
	do "$scripts/4b_1_CompileEmploymentGrid_CBSA_validations"

	* Step 6: generate prime locations by employment type;
	* the type is handed to the called script through the global ET
	foreach emp_type in 99 3 5 {
		global ET "`emp_type'"
		do "$scripts/4b_2_GeneratePLs_val"
	}

	* Step 7: same processing as step 5, for the validation subsets
	*     EWPPfirst: weights from the first half of MSAs predict PLs for the second half
	*     EWPPlast:  weights from the second half of MSAs predict PLs for the first half
	do "$scripts/4b_3_CompileEWPPfirst"
	do "$scripts/4b_4_CompileEWPPlast"

	* Step 8: generate prime locations for the validation subsamples
	*     val_first: weights from the first half of MSAs, PLs for the second half
	*     val_last:  weights from the second half of MSAs, PLs for the first half
	do "$scripts/4b_5_GeneratePLs_val_first"
	do "$scripts/4b_6_GeneratePLs_val_last"

	* Step 9: correlate prime locations for validation
	do "$scripts/4c_1_Validation.do"

*****************************************************************************
* Prepare 125 city sample (Scraped POIs - No NETS DATA EXCEPT FOR WEIGHTS ) *
*****************************************************************************

* a. Basic data preparation

	* Tabulate big data establishments
	do "$scripts/5a_0_Establishments.do"

	* Process outputs of the C++ clustering algorithm and compile the 125-city dataset
	do "$scripts/5a_1_Compile125cityDataset.do"

* b. Create prime locations and produce basic descriptives

	* Aggregate and delineate prime locations for the 125 global cities
	do "$scripts/5b_1_DelineatePLs_125cityDataset.do"

	* Summary statistics graph and data table
	do "$scripts/5b_2_SummaryStats_125CitiesPLs.do"

	* Map prime locations in the global cities
	do "$scripts/5b_3_MAPPING.do"

	* Prime location shapefiles (calls Python Geopandas)
	if $runpython_scripts != 0 {
		do "$scripts/5b_3_GenPLShapes.do"
	}

	* Grid data shapefiles
	do "$scripts/5b_4_GengGridDataShapes"

* c. Gradient analysis

	* Distance from prime location borders
	do "$scripts/5c_1_CalculateDistanceToPLBorder.do"

	* Estimate and illustrate gradients, by city structure and NA vs. ROW
	do "$scripts/5c_2_Gradients.do"

* d. Variation between cities: regressions and subadditivity graph
	do "$scripts/6d_1_CorrelatesAndSubadditivity.do"

* e. Further validation of global cities big data: over-ID tests of big data establishments in the global sample
	do "$scripts/6e_1_125CitiesValidation.do"

* Close the log file of the run
log close

* Script ends
			

